library(dtwclust)
library(dtw)
df<-read.csv("../clean_data/gap_list_year.csv")
df
df <- subset(df, select = -c(X) )
df
colnames(df)[colnames(df) == "X2002"] = "2002"
colnames(df)[colnames(df) == "X2003"] = "2003"
colnames(df)[colnames(df) == "X2005"] = "2005"
colnames(df)[colnames(df) == "X2007"] = "2007"
colnames(df)[colnames(df) == "X2009"] = "2009"
colnames(df)[colnames(df) == "X2011"] = "2011"
colnames(df)[colnames(df) == "X2013"] = "2013"
colnames(df)[colnames(df) == "X2015"] = "2015"
colnames(df)[colnames(df) == "X2017"] = "2017"
colnames(df)[colnames(df) == "X2019"] = "2019"
colnames(df)[colnames(df) == "X2022"] = "2022"
df
jurisdiction = df[['Jurisdiction']]
dtw_df <- df[, -1]
dtw_df
df_lst <- tslist(dtw_df)
remove_nan <- function(ts) {
ts[!is.na(ts)]
}
# Apply the function to each time series in the list
df_lst <- lapply(df_lst, remove_nan)
head(df_lst)
$`1`
[1] -8 -7 -6 -6 -9 -5 -8 -8 -7 -10 -10
$`2`
[1] -7 -13 -8 -9 -9 -8 -6 -6 -8 -7 -13
$`3`
[1] -11 -6 -8 -8 -6 -7 -6 -9 -5 -5 -5
$`4`
[1] -6 -9 -8 -8 -11 -5 -8 -10 -9 -5 -9
$`5`
[1] -4 -7 -7 -9 -6 -5 -6 -6 -9 -9 -7
$`6`
[1] -7 -7 -6 -5 -7 -9 -8 -7 -6 -8 -4
df_cvi <- list()
for (i in 2:10){
df_clust <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw_basic", centroid = "pam")
df_metric <- cvi(df_clust, type = "valid", log.base = 10)
df_cvi <- append(df_cvi, list(df_metric))
}
df_cvi_ma <- do.call(rbind, df_cvi)
rw <- c("K2","K3","K4","K5","K6","K7","K8","K9","K10")
rownames(df_cvi_ma) <- rw
print(df_cvi_ma)
Sil SF CH DB DBstar D COP
K2 0.06625897 6.041834e-13 16.025157 2.360298 2.360298 0.08571429 0.5281549
K3 0.06553335 0.000000e+00 24.553137 2.468201 2.734273 0.09230769 0.4887729
K4 0.05828469 0.000000e+00 10.249424 1.668970 1.731410 0.11764706 0.4275639
K5 0.09511155 0.000000e+00 12.598315 1.551937 1.551937 0.15555556 0.3892899
K6 0.01946415 0.000000e+00 7.245536 1.944170 2.076265 0.14285714 0.3768423
K7 0.02021789 0.000000e+00 6.388889 1.399755 1.528309 0.17647059 0.3519877
K8 0.02208296 0.000000e+00 6.332946 1.569238 1.769674 0.25714286 0.3218029
K9 0.01721302 0.000000e+00 5.309659 1.431574 1.458588 0.23684211 0.3245167
K10 0.02815402 0.000000e+00 5.075740 1.305319 1.359985 0.17647059 0.2936453
– “Sil” (!): Silhouette index (Rousseeuw (1987); to be maximized).-K4 – “SF” (~): Score Function (Saitta et al. (2007); to be maximized; see notes). – “CH” (~): Calinski-Harabasz index (Arbelaitz et al. (2013); to be maximized).-k3 – “DB” (?): Davies-Bouldin index (Arbelaitz et al. (2013); to be minimized).k4 – “DBstar” (?): Modified Davies-Bouldin index (DB*) (Kim and Ramakrishna (2005); to be minimized). -k4 – “D” (!): Dunn index (Arbelaitz et al. (2013); to be maximized). k5 – “COP” (!): COP index (Arbelaitz et al. (2013); to be minimized). k9
df_cvi2 <- list()
for (i in 1:100){
df_clust2 <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw_basic", centroid = "pam", seed=i)
df_metric2 <- cvi(df_clust2, type = "valid", log.base = 10)
df_cvi2 <- append(df_cvi2, list(df_metric2))
}
df_cvi_ma2 <- do.call(rbind, df_cvi2)
rw2 <- as.character(seq(1, 100))
rownames(df_cvi_ma2) <- rw2
print(df_cvi_ma2)
Sil SF CH DB DBstar D COP
1 0.0100291529 0.000000e+00 10.763042 1.775357 2.008847 0.09230769 0.4759678
2 0.0841726422 0.000000e+00 14.959140 1.855387 1.859596 0.15909091 0.5110177
3 0.0600760529 0.000000e+00 9.453667 2.326232 2.406677 0.16981132 0.4444136
4 0.0496788759 0.000000e+00 9.910274 2.848242 2.879791 0.15000000 0.4235500
5 -0.0126611505 0.000000e+00 8.056536 1.488416 1.509573 0.17647059 0.4235189
6 0.0700124781 0.000000e+00 10.502069 2.151212 2.250850 0.13636364 0.3974040
7 0.0487342442 0.000000e+00 15.395473 2.328995 2.446345 0.26315789 0.4405351
8 0.0759120568 0.000000e+00 9.395599 1.877610 2.010928 0.16216216 0.3909585
9 -0.0041514049 0.000000e+00 9.667118 2.171054 2.340264 0.16981132 0.4326261
10 0.0678908426 0.000000e+00 12.822160 1.617566 1.641086 0.11320755 0.4194244
11 0.0520852267 0.000000e+00 10.278545 1.580213 1.768973 0.11320755 0.4282679
12 0.0254229410 0.000000e+00 7.458387 2.007001 2.035371 0.13846154 0.4866638
13 -0.0146930208 0.000000e+00 6.814201 2.947357 3.101729 0.10000000 0.5063010
14 0.0636263370 0.000000e+00 10.541846 2.061179 2.189366 0.15384615 0.3826893
15 0.0612873591 0.000000e+00 17.157730 2.086057 2.328448 0.12000000 0.4523979
16 -0.0037077513 0.000000e+00 14.784904 2.663252 2.720116 0.10769231 0.4816588
17 0.0500069837 0.000000e+00 10.377778 1.743756 1.806446 0.11764706 0.4128934
18 0.0402075317 0.000000e+00 14.517375 1.763010 1.869737 0.09523810 0.4645744
19 0.0750765235 0.000000e+00 15.064929 1.809762 1.992429 0.13953488 0.3968370
20 0.0645517230 0.000000e+00 10.379167 1.954194 2.148308 0.13953488 0.3910928
21 0.0392081114 0.000000e+00 9.137093 2.509584 2.657427 0.09230769 0.4351193
22 0.0781700848 0.000000e+00 9.870654 1.634688 1.635041 0.16216216 0.4208348
23 0.0565511300 0.000000e+00 11.378026 1.612674 1.805035 0.13636364 0.4135622
24 0.0532632395 0.000000e+00 9.530556 1.847981 2.022263 0.13953488 0.3994537
25 0.0719568777 0.000000e+00 10.704472 1.918323 2.205540 0.13953488 0.3924999
26 0.0633535542 0.000000e+00 10.516875 2.153644 2.306950 0.16216216 0.4226172
27 0.0443520455 0.000000e+00 11.237233 1.917937 2.196331 0.13953488 0.3891735
28 0.0531202735 0.000000e+00 11.155899 1.739711 1.758863 0.15000000 0.4017578
29 0.0530320978 0.000000e+00 14.820963 1.741738 1.888614 0.20930233 0.4055971
30 0.0734805064 0.000000e+00 8.430883 2.165030 2.192134 0.14285714 0.3960272
31 0.0247141682 0.000000e+00 10.095822 2.237583 2.463326 0.09230769 0.4735172
32 0.0659614818 0.000000e+00 14.943590 1.508612 1.636860 0.22727273 0.4639282
33 0.0779517186 0.000000e+00 10.598687 1.871760 1.983765 0.16216216 0.4112995
34 0.0640255858 0.000000e+00 11.629318 1.858690 2.050606 0.13953488 0.3880855
35 0.0393308929 0.000000e+00 15.207723 1.736991 1.872961 0.16071429 0.4474058
36 0.0666889894 0.000000e+00 16.209055 1.508144 1.542135 0.20408163 0.4418455
37 0.0504152998 0.000000e+00 10.964469 1.429309 1.567808 0.10714286 0.4129268
38 0.0131408732 0.000000e+00 12.557252 1.870953 1.988091 0.12500000 0.4709555
39 0.0655813747 0.000000e+00 17.948819 2.252317 2.433793 0.14285714 0.4343505
40 0.0244582256 0.000000e+00 13.997433 1.554684 1.676051 0.14285714 0.4792954
41 0.0652541443 0.000000e+00 11.895461 2.062778 2.222153 0.12765957 0.4417892
42 0.0570659359 0.000000e+00 9.744711 1.969459 2.004971 0.11764706 0.3873015
43 0.0117597265 0.000000e+00 9.006748 1.399974 1.522136 0.10714286 0.4288147
44 0.0446080001 0.000000e+00 10.371813 1.910604 2.110348 0.13953488 0.4008733
45 0.0429769845 0.000000e+00 14.371725 1.861356 1.908337 0.15555556 0.4085027
46 0.0491078530 0.000000e+00 13.404178 1.919565 2.060959 0.20454545 0.3981291
47 0.0600990707 0.000000e+00 8.479720 1.613841 1.624124 0.11764706 0.4002724
48 -0.0025074645 0.000000e+00 14.100000 2.124761 2.188914 0.11111111 0.4877645
49 0.0701503462 0.000000e+00 17.561141 1.915211 2.019232 0.15384615 0.4335640
50 0.0296440094 0.000000e+00 9.210623 2.069802 2.193130 0.09230769 0.4613271
51 0.0730203569 0.000000e+00 16.718121 1.768872 1.853167 0.14285714 0.4301929
52 0.0343083510 0.000000e+00 10.430078 1.990142 2.026127 0.13636364 0.4419429
53 -0.0008753423 0.000000e+00 8.093404 1.804325 1.908250 0.09230769 0.5050032
54 0.0328432825 0.000000e+00 9.750583 1.199231 1.209350 0.15789474 0.4042262
55 0.0880796126 0.000000e+00 15.709181 1.903788 1.912888 0.16279070 0.3889857
56 0.0574090037 0.000000e+00 13.231847 1.925360 2.044353 0.17647059 0.4504834
57 0.0690375243 0.000000e+00 14.913663 1.709613 1.785029 0.18000000 0.4513889
58 0.0780296416 0.000000e+00 15.130702 1.946452 2.032804 0.15384615 0.4638353
59 0.0317466841 0.000000e+00 8.442142 1.802683 1.831633 0.15384615 0.4772134
60 0.0259975424 0.000000e+00 9.767622 2.097017 2.270671 0.09230769 0.4837363
61 0.0387953470 0.000000e+00 5.882740 1.439950 1.439950 0.24324324 0.4295378
62 0.0923346826 0.000000e+00 15.413638 1.613849 1.771349 0.23255814 0.4840881
63 0.0315490085 0.000000e+00 10.343462 2.139230 2.323623 0.09230769 0.4703122
64 0.0908976413 0.000000e+00 16.975741 1.523828 1.547637 0.23255814 0.4214494
65 0.0528857567 0.000000e+00 8.928087 1.772845 1.773438 0.14285714 0.4330073
66 0.0740787851 0.000000e+00 16.067295 1.721329 1.839521 0.26315789 0.4491792
67 0.0255768820 0.000000e+00 10.564903 2.378261 2.549689 0.11320755 0.4168934
68 0.1127901082 0.000000e+00 14.381510 1.295255 1.371106 0.20408163 0.4304676
69 0.0363249450 0.000000e+00 11.014273 2.176107 2.266411 0.09230769 0.4327640
70 0.0446041689 0.000000e+00 11.405678 1.982478 2.115848 0.16216216 0.3995312
71 0.0541648005 0.000000e+00 9.840220 1.745620 1.778899 0.21428571 0.4055178
72 0.0392706115 0.000000e+00 9.135332 1.520537 1.548328 0.17647059 0.4125765
73 0.0500069837 0.000000e+00 10.377778 1.743756 1.806446 0.11764706 0.4128934
74 0.0236143515 0.000000e+00 7.240051 1.785420 1.889133 0.16981132 0.4562057
75 0.0419395302 0.000000e+00 9.022222 1.730745 1.801963 0.11764706 0.3968444
76 0.0426626234 0.000000e+00 15.970677 2.192091 2.531687 0.09523810 0.4631893
77 0.0827279015 0.000000e+00 15.918340 1.564605 1.572128 0.20000000 0.4342921
78 0.0833365716 0.000000e+00 17.264459 1.824499 1.878477 0.18750000 0.4398688
79 0.0525624018 0.000000e+00 16.412698 1.790103 1.873185 0.13333333 0.3982825
80 -0.0028311091 0.000000e+00 8.748098 2.425672 2.528851 0.14285714 0.4720661
81 0.0076063231 0.000000e+00 15.360278 1.754127 1.960775 0.14285714 0.4939960
82 0.0693734637 0.000000e+00 9.434622 1.882491 1.924932 0.21428571 0.4023527
83 0.0963751063 0.000000e+00 15.793011 1.871982 2.002971 0.23255814 0.4204390
84 0.0662357816 0.000000e+00 9.118747 1.617927 1.675620 0.17142857 0.3987197
85 0.0653312685 0.000000e+00 16.899593 1.863830 1.980385 0.15555556 0.4192861
86 0.0316797832 6.661338e-16 7.371285 1.518346 1.536835 0.11764706 0.4067188
87 0.0648319245 0.000000e+00 10.634078 1.802736 1.907256 0.14285714 0.3876756
88 0.0287269808 0.000000e+00 3.720833 2.794126 2.808265 0.14634146 0.5287727
89 0.0356484279 0.000000e+00 15.459982 2.027381 2.171577 0.12500000 0.4257889
90 0.0593205408 0.000000e+00 15.382924 1.501254 1.538430 0.21951220 0.4461765
91 0.0634798555 0.000000e+00 10.671816 1.664219 1.727520 0.14285714 0.3880272
92 0.0513376330 0.000000e+00 15.352075 2.154485 2.316881 0.12500000 0.4388253
93 0.1170520725 0.000000e+00 16.434014 1.751573 1.876421 0.15555556 0.4096251
94 0.0828138158 0.000000e+00 15.149473 1.871655 2.004138 0.16216216 0.4155054
95 0.0259419479 0.000000e+00 9.247978 1.460143 1.503397 0.16981132 0.4389056
96 0.0333847451 0.000000e+00 7.679336 1.942389 2.053784 0.16981132 0.4281659
97 0.0414947781 0.000000e+00 9.718127 1.372342 1.372342 0.16216216 0.4034727
98 0.0649727576 0.000000e+00 11.590248 1.969242 2.198316 0.13953488 0.3880528
99 0.0347497202 0.000000e+00 15.497297 1.738330 1.922942 0.14000000 0.4343026
100 0.0578968839 0.000000e+00 14.550685 2.214026 2.424709 0.14893617 0.4253477
for (i in 2:10){df_clust_opt <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw", centroid = "pam",seed = 700)
plot(df_clust_opt)}
# for (i in 1:10){df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
# plot(df_clust_opt_final)}
# for (i in 11:20){df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
# plot(df_clust_opt_final)}
#k4
df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = 700)
plot(df_clust_opt_final)
# Extract cluster assignments
cluster_assignments <- df_clust_opt_final@cluster
# Determine the number of clusters
num_clusters <- max(cluster_assignments)
# Loop through each cluster and print the jurisdictions in it
for (cluster_number in 1:num_clusters) {
cat("Jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Print the jurisdictions corresponding to these indices
print(jurisdiction[indices_in_cluster])
cat("\n") # Add a newline for readability
}
Jurisdictions in Cluster 1 :
[1] "Hawaii" "North Carolina" "South Carolina" "Tennessee"
Jurisdictions in Cluster 2 :
[1] "Arizona" "Connecticut" "Maryland" "Michigan" "Missouri" "Montana" "Nevada" "North Dakota" "Pennsylvania"
[10] "Vermont" "Wisconsin"
Jurisdictions in Cluster 3 :
[1] "Alabama" "Alaska" "California" "Florida" "Illinois" "Indiana" "Kentucky" "Louisiana" "Maine"
[10] "Massachusetts" "Nebraska" "New York" "Ohio" "Oklahoma" "Oregon" "South Dakota" "Texas" "Wyoming"
Jurisdictions in Cluster 4 :
[1] "Arkansas" "Colorado" "Delaware" "Georgia" "Idaho" "Iowa" "Kansas" "Minnesota" "Mississippi"
[10] "National" "New Hampshire" "New Jersey" "New Mexico" "Rhode Island" "Utah" "Virginia" "Washington" "West Virginia"
# Create an empty dataframe to store the results
jurisdiction_clusters <- data.frame(Jurisdiction = character(), G4_Cluster = numeric(), stringsAsFactors = FALSE)
# Loop through each cluster and append the jurisdictions and their cluster number to the dataframe
for (cluster_number in 1:num_clusters) {
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Extract the jurisdictions corresponding to these indices
jurisdictions_in_cluster <- jurisdiction[indices_in_cluster]
# Create a temporary dataframe for this cluster
temp_df <- data.frame(Jurisdiction = jurisdictions_in_cluster, G4_Cluster = rep(cluster_number, length(jurisdictions_in_cluster)), stringsAsFactors = FALSE)
# Append the temporary dataframe to the main dataframe
jurisdiction_clusters <- rbind(jurisdiction_clusters, temp_df)
}
# Export the dataframe to a CSV file
write.csv(jurisdiction_clusters, "../clean_data/jurisdiction_clusters_G4.csv", row.names = FALSE)
# View the resulting dataframe
print(jurisdiction_clusters)
# Load necessary libraries
library(ggplot2)
library(reshape2)
# Loop through each cluster
for (cluster_number in 1:num_clusters) {
cat("Plotting jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Get the names of the jurisdictions in this cluster
jurisdictions_in_cluster <- jurisdiction[indices_in_cluster]
# Filter the gap_list_year data frame for these jurisdictions
cluster_data <- df[df$Jurisdiction %in% jurisdictions_in_cluster, ]
# Convert the data to long format for ggplot
long_df <- melt(cluster_data, id.vars = "Jurisdiction", variable.name = "Year", value.name = "Value")
# Plot
p <- ggplot(long_df, aes(x = Year, y = Value, group = Jurisdiction, color = Jurisdiction)) +
geom_line() +
labs(title = paste("Cluster", cluster_number), x = "Year", y = "Gap") +
theme(legend.position = "right")
print(p)
#ggsave(paste("cluster_", cluster_number, ".png", sep=""), plot = p)
}
Plotting jurisdictions in Cluster 1 :
Plotting jurisdictions in Cluster 2 :
Plotting jurisdictions in Cluster 3 :
Plotting jurisdictions in Cluster 4 :